# import all the libraries
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import time
from joblib import dump, load
# Load the raw visit-level data and take a first look at shape, dtypes,
# summary statistics, correlations, duplicates and missing values.
df = pd.read_csv('craft_demo_sample.csv')
print('The number of records: {}'.format(df.shape[0]))
print('The number of fields: {}'.format(df.shape[1]))
# check data columns and types
print(df.dtypes)
# describe the numerical variables
df.describe(include=[np.number])
# describe the categorical variables
# FIX: np.object was deprecated and removed in NumPy 1.24 -- use builtin `object`
df.describe(include=[object])
# we can only see a slight correlation between pages_visited and session_duration,
# pages_visited and login_page_flag, and login_page_flag and customer
# FIX: numeric_only=True is required on mixed-dtype frames in pandas >= 2.0
df.corr(numeric_only=True)
# drop duplicates if any
rows_original = df.shape[0]
print('Before dropping duplicates, the shape is {}'.format(df.shape))
data_df = df.drop_duplicates()
print('After dropping duplicates, the shape is {}'.format(data_df.shape))
print('There are total {} rows dropped'.format(rows_original - data_df.shape[0]))
# NOTE(review): the deduplicated frame `data_df` is never used below -- the
# rest of the script keeps working on `df`; confirm this is intentional
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
missing_value_df
# Some columns carry no usable information -- drop them.
df = df.drop(columns=['monthofyear', 'qbo_signup_time', 'prospect', 'visitor_session_num'])
# session_num arrives as a 'prefix:<number>' string; keep the numeric part
# (assumes every value contains exactly one ':' -- TODO confirm)
df['session_num'] = df['session_num'].apply(lambda x: x.split(':')[1]).astype(int)
# a visit belongs to a canceled customer exactly when a cancel date is present
# FIX: `.isnull() == False` replaced with the idiomatic `.notnull()`
df['qbo_canceled'] = df['qbo_cancel_date'].notnull()
# the features can be separated into several groups
continuous_col = ['session_num', 'session_duration_second', 'pages_visited']
# histograms for the continuous variables
df.hist(column=continuous_col, figsize=(10, 8))
df['session_num'].hist(bins=100)
plt.title('Histogram of session_num')
# share of visits whose session_num is exactly 3
sum(df['session_num'] == 3) / len(df)
# When do visitors come to the website?
time_columns = ['hourofday', 'dayofweek', 'dayofmonth']
for time_col in time_columns:
    plt.figure(figsize=(8, 5))
    sns.countplot(x=time_col, data=df)
# people prefer to check our app during the working hours of the work days
# total distinct visitors
len(df['visitor_id'].unique())
df.columns
# Where are the visitors from?
geo_columns = ['geo_country', 'region', 'city']
len(df[df['geo_country'] == 'usa']) / len(df) * 100
# count US visits per state and draw them on a choropleth map
df_usa_count = (df[df['geo_country'] == 'usa'][['region', 'visit_date']]
                .groupby(['region'])
                .count()
                .reset_index())
df_usa_count['region'] = df_usa_count['region'].str.upper()
import plotly.express as px
fig = px.choropleth(locations=list(df_usa_count['region']),
                    locationmode="USA-states",
                    color=list(df_usa_count['visit_date']),
                    scope="usa")
fig.show()
# most of the USA visitors are from California, Texas and Florida
# What kinds of devices do visitors use?
dev_column = ['device', 'operating_system']
df.groupby(dev_column).count()['visit_date']
df['device'].value_counts().plot.pie(figsize=(8, 8))
plt.legend(list(df['device'].value_counts()))
len(df[(df['browser'] == 'Chrome')]) / len(df)
df['browser'].value_counts().plot.bar(figsize=(8, 6))
# FIX: the legend for the browser chart previously reused the DEVICE counts
plt.legend(list(df['browser'].value_counts()))
# mobile visits broken down by OS/browser, as a share of all mobile visits
# FIX: the denominator was a hard-coded 10412; compute it from the data
mobile_total = len(df[df['device'] == 'Mobile'])
df[df['device'] == 'Mobile'].groupby(['device', 'operating_system', 'browser']).count()['visitor_id'] / mobile_total
df['browser'][df['device'] == 'Mobile'].value_counts().plot.bar(figsize=(10, 8))
len(df[df['browser'] == 'Chrome']) / len(df) * 100
# keep only the first whitespace-separated token of the language description
df['language_desc_processed'] = df['language_desc'].apply(lambda x: str(x).split(' ')[0])
len(df[df['language_desc_processed'] == 'English']) / len(df) * 100
df['language_desc_processed'][df['language_desc_processed'] != 'English'].value_counts().plot.bar(figsize=(12, 8))
df['visitor_type'].value_counts().plot.pie(figsize=(8, 8))
plt.legend(list(df['visitor_type'].value_counts()))
df['customer'].value_counts().plot.pie(figsize=(8, 8))
plt.legend(list(df['customer'].value_counts()))
df['employee_size'].value_counts().plot.bar(figsize=(12, 8))
df['employee_size'].unique()
# buckets used to collapse the raw employee_size labels
mini_size = ['1', '2-4', '1-4']
small_medium = ['11-20', '21-30', '31-50']
medium = ['101-250', '251-500', '500-1000']
def company_size(num):
    """Collapse a raw employee_size label into a coarser bucket.

    Labels not covered by any bucket (e.g. '5-10', 'other', NaN) are
    returned unchanged, so the mapping is safe to `apply` to the column.
    The bucket lists are local constants so the function no longer depends
    on module-level globals; the chained `if`s are now a proper elif chain.
    """
    mini_size = ['1', '2-4', '1-4']
    small_medium = ['11-20', '21-30', '31-50']
    medium = ['101-250', '251-500', '500-1000']
    if num in mini_size:
        return '1-4'
    elif num in small_medium:
        return '11-50'
    elif num in medium:
        return '101-1000'
    else:
        return num
# Firmographic breakdowns, excluding the catch-all 'other' level.
df['employee_size_processed'] = df['employee_size'].apply(company_size)
df['employee_size_processed'][df['employee_size_processed'] != 'other'].value_counts().plot.bar(figsize=(12, 8))
df['employee_size_processed'][df['employee_size_processed'] != 'other'].unique()
df['revenue_size'][df['revenue_size'] != 'other'].value_counts().plot.bar(figsize=(12, 8))
df['industry'][df['industry'] != 'other'].value_counts().plot.bar(figsize=(12, 8))
len(df[df['industry'] == 'DnB - Business Services']) / len(df)
df['year_in_business'][df['year_in_business'] != 'other'].value_counts().plot.bar(figsize=(12, 8))
df['channel'][df['channel'] != 'other'].value_counts().plot.bar(figsize=(12, 8))
df['search_type'].value_counts().plot.pie(figsize=(8, 8))
sum(df['landing_page'] == 'Homepage') / len(df)
df['landing_page'].value_counts().plot.pie(figsize=(8, 8))
# strip scheme / www / .com noise out of the referrer URL
df['referrer_new'] = df['referrer'].apply(lambda x: str(x).replace('https://', '').replace('www.', '').replace('.com', '').replace('http://', ''))
# turn path separators and dots into spaces so the referrer tokenizes cleanly
# FIX: the chain originally ended with a no-op `.replace(' ', ' ')`; removed
df['referrer_new'] = df['referrer_new'].apply(lambda x: x.replace('/', ' ').replace('.', ' '))
def google_bing(link):
    """Collapse referrers that mention a known origin to a canonical label.

    Checks 'google', 'bing', 'intuit' in that order and returns the first
    label contained in `link`; anything else passes through unchanged.
    """
    for needle in ('google', 'bing', 'intuit'):
        if needle in link:
            return needle
    return link
df['referrer_new'] = df['referrer_new'].apply(google_bing)
df['referrer_new'].value_counts()['google']
sum(df['referrer_new'] == 'google') / len(df)
sum(df['referrer_new'] == 'intuit') / len(df)
from wordcloud import WordCloud
# Build one big text blob from every referrer: tokenize, lowercase,
# and normalize any token containing 'google' to plain 'google'.
# FIX: the original grew `comment_words` with += in a loop (quadratic);
# accumulate pieces in a list and join once instead.
pieces = []
for val in df.referrer_new:
    tokens = ['google' if 'google' in tok.lower() else tok.lower()
              for tok in str(val).split()]
    pieces.append(" ".join(tokens) + " ")
comment_words = "".join(pieces)
wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=['nan', 'https://', 'www', 'com', 'http'],
                      min_font_size=10).generate(comment_words)
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
df['care_user_click'] = df['care_user_click'].fillna('N')
sum(df['care_user_click'] == 'Y') / len(df)
len(df[df['login_page_flag'] == 1]) / len(df)
# treat trial users as free users
df['qbo_existing_customer_type'] = df['qbo_existing_customer_type'].str.replace('Trial', 'Free')
# Split visits into four disjoint groups:
#   Free      - existing free/trial customers
#   Paid      - existing customers with a paid SKU
#   Random    - first-time visitors with no customer record
#   Attention - returning visitors with no customer record
# FIX: assignments now go through df.loc instead of chained indexing
# (df['group'][mask] = ...), which writes to a temporary copy and is a no-op
# under pandas copy-on-write. The redundant `!= 'Free'` term was dropped from
# the Random/Attention masks: for rows where the type isnull(), NaN != 'Free'
# is always True, so it never filtered anything.
df['group'] = np.nan
free_mask = df['qbo_existing_customer_type'] == 'Free'
df_free = df[free_mask]
df.loc[free_mask, 'group'] = 'Free'
paid_mask = df['qbo_existing_customer_sku'].notnull() & (df['qbo_existing_customer_type'] != 'Free')
df_paid = df[paid_mask]
df.loc[paid_mask, 'group'] = 'Paid'
random_mask = df['qbo_existing_customer_type'].isnull() & (df['visitor_type'] == 'First Time')
df_random_check = df[random_mask]
df.loc[random_mask, 'group'] = 'Random'
attention_mask = df['qbo_existing_customer_type'].isnull() & (df['visitor_type'] == 'Return')
df_attention_check = df[attention_mask]
df.loc[attention_mask, 'group'] = 'Attention'
# sanity check: the four groups should cover every row
len(df_free) + len(df_attention_check) + len(df_random_check) + len(df_paid)
len(df)
df.columns
# share of visits per group, as a label -> percentage dict
group = dict((round(df.groupby(['group']).count()['visit_date'] / len(df) * 100, 2)))
group
labels = group.keys()
sizes = group.values()
fig1, ax1 = plt.subplots(figsize=(8, 8))
ax1.pie(sizes, labels=labels, autopct='%1.2f%%',
        shadow=False, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
# central tendency of the continuous features per group
# FIX: select the columns BEFORE aggregating -- calling .median()/.mean() on
# the whole frame raises on non-numeric columns in pandas >= 2.0
df.groupby('group')[['session_num', 'session_duration_second', 'pages_visited']].median().reset_index()
df.groupby('group')[['session_num', 'session_duration_second', 'pages_visited']].mean().reset_index()
df.columns
# most common value of each categorical feature per group
# FIX: column selection after groupby must be a list (double brackets);
# tuple-style selection was removed in pandas 2.0
df.groupby(['group'])[['language_desc_processed', 'employee_size_processed', 'referrer_new', 'care_article_id',
                       'care_article_title', 'care_user_click', 'landing_page']].agg(pd.Series.mode)
df.groupby(['group'])[['browser', 'resolution_desc', 'geo_country', 'region', 'city',
                       'device', 'operating_system', 'language_desc', 'connection_type_desc']].agg(pd.Series.mode)
df.groupby(['group'])[['revenue_size', 'year_in_business', 'industry', 'annual_revenue',
                       'channel', 'search_type', 'landing_page', 'referrer']].agg(pd.Series.mode)
df.groupby(['group'])[['channel', 'care_article_title', 'care_user_click']].agg(pd.Series.mode).reset_index()
def process_operat_system(v):
    """Bucket low-volume / unspecified operating systems under 'Other'."""
    rare_os = ('Chrome OS', 'Not Specified', 'Symbian', 'Mobile Linux', 'Tizen')
    return 'Other' if v in rare_os else v
df['processed_operating_system'] = df['operating_system'].apply(process_operat_system)

def _rare_values(series, threshold=1000):
    """Return the values of `series` seen fewer than `threshold` times.

    Computes value_counts() once (the original computed it twice per column)
    and gives the four copy-pasted "collapse rare levels" steps below a
    single source of truth for what counts as rare.
    """
    counts = series.value_counts()
    return list(counts[counts < threshold].index)

# Collapse rare levels of several high-cardinality categoricals to 'other'.
# NOTE: each process_* function reads the module-level `rare_values` that was
# most recently assigned, so each assignment and its apply must stay adjacent.
rare_values = _rare_values(df['browser'])
def process_browser(v):
    return 'other' if v in rare_values else v
df['processed_browser'] = df['browser'].apply(process_browser)

rare_values = _rare_values(df['geo_country'])
def process_geo_country(v):
    return 'other' if v in rare_values else v
df['processed_geo_country'] = df['geo_country'].apply(process_geo_country)

rare_values = _rare_values(df['language_desc_processed'])
def process_language(v):
    return 'other' if v in rare_values else v
df['processed_language_desc'] = df['language_desc_processed'].apply(process_language)

rare_values = _rare_values(df['referrer_new'])
def process_referrer_new(v):
    return 'other' if v in rare_values else v
df['processed_referrer_new'] = df['referrer_new'].apply(process_referrer_new)
# Final feature list used by the model.
features = [
    'hourofday', 'dayofweek', 'dayofmonth', 'session_num', 'device', 'processed_operating_system',
    'processed_browser', 'processed_geo_country', 'processed_language_desc', 'connection_type_desc',
    'session_duration_second', 'pages_visited', 'visitor_type', 'employee_size_processed', 'revenue_size',
    'year_in_business', 'channel', 'processed_referrer_new', 'landing_page', 'care_user_click',
    'login_page_flag'
]
# Label construction:
#   negative class (0): visits with no QBO signup date
#   positive class (1): visitors who signed up on the day of the visit
# .copy() avoids SettingWithCopyWarning on the 'predict' assignments below.
df_random = df[df['qbo_signup_date'].isnull()].copy()
attention_user = list(df['visitor_id'][df['qbo_signup_date'] == df['visit_date']])
df_attention = df[df['visitor_id'].isin(attention_user)].copy()
df_random['predict'] = 0
df_attention['predict'] = 1
# FIX: DataFrame.append was deprecated and removed in pandas 2.0 -- use concat
df_new = pd.concat([df_attention, df_random])
percent_missing = df_new[features].isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
missing_value_df
# simple imputation: mode for categoricals, mean for the duration
df_new['device'].fillna(df_new['device'].mode()[0], inplace=True)
df_new['session_duration_second'].fillna(df_new['session_duration_second'].mean(), inplace=True)
df_new['year_in_business'].fillna(df_new['year_in_business'].mode()[0], inplace=True)
df_new['care_user_click'] = df_new['care_user_click'].apply(lambda x: 1 if x == 'Y' else 0)
# FIX: categorical_features / numerical_features were only defined inside a
# commented-out block, so the pd.get_dummies / pd.concat lines below raised
# a NameError. Define them explicitly instead.
categorical_features = ['device', 'processed_operating_system',
                        'processed_browser', 'processed_geo_country', 'processed_language_desc',
                        'connection_type_desc', 'employee_size_processed', 'revenue_size',
                        'year_in_business', 'channel', 'processed_referrer_new', 'landing_page',
                        'visitor_type']
numerical_features = [f for f in features if f not in categorical_features]
# one-hot encode the categoricals and stitch the design matrix together
df_cate = pd.get_dummies(df_new[categorical_features])
y = df_new['predict']
df_new['predict'].value_counts()
X = pd.concat([df_cate, df_new[numerical_features]], axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# undersample the (majority) negative class down to 500 rows to balance training
X_train = pd.concat([X_train[y_train == 1], X_train[y_train == 0].sample(500, random_state=0)])
y_train = pd.concat([y_train[y_train == 1], y_train[y_train == 0].sample(500, random_state=0)])
corr = X.corr()
plt.figure(figsize=(12, 10))
heat = sns.heatmap(data=corr)
plt.title('Heatmap of Correlation')
class ML_models():
    """Tune, re-fit, persist and cross-validate a scikit-learn classifier
    selected by class name.

    Parameters
    ----------
    x_train, y_train : training design matrix / labels
    x_test, y_test   : held-out data (stored for later evaluation; train()
                       itself does not touch them)
    model            : name of an estimator class visible at module level,
                       e.g. "LogisticRegression" or "RandomForestClassifier"
    model_name       : file stem used when dumping the model under models/
    model_param      : hyperparameter grid passed to GridSearchCV
    """

    def __init__(self, x_train, y_train, x_test, y_test, model, model_name, model_param):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.model = model              # estimator class name (string)
        self.model_name = model_name    # file stem saved under models/
        self.model_param = model_param  # hyperparameter grid

    def train(self):
        """Grid-search the hyperparameters, re-fit the best configuration on
        the whole training set, persist it, and report 5-fold CV recall.

        Returns (cv_score_mean, cv_score_std, fitted_model).
        """
        # FIX: resolve the estimator class via module globals instead of
        # eval() on an interpolated string, and drop the `global model`
        # statement -- the fitted model is returned, not leaked globally.
        model = globals()[self.model]()
        # tune with GridSearchCV; recall because the positive class is the
        # one we care about in this binary problem
        model_grid = GridSearchCV(estimator=model,
                                  param_grid=self.model_param,
                                  scoring='recall',
                                  n_jobs=4,
                                  cv=5)
        start_time_grid = time.time()
        print("Start training...")
        model_grid.fit(self.x_train, self.y_train)
        total_time_grid = time.time() - start_time_grid
        print("The total GridSearchCV training time:", round(total_time_grid, 5))
        print("The best hyperparameter:", model_grid.best_params_)
        print("The best score of GridSearchCV training:", round(model_grid.best_score_, 5))
        print("<--------------------------------------->")
        # re-fit the best configuration on the whole training set
        model.set_params(**model_grid.best_params_)
        start_time_train = time.time()
        model.fit(self.x_train, self.y_train)
        # persist the fitted model (assumes a models/ directory exists -- TODO confirm)
        dump(model, 'models/' + self.model_name + '.joblib')
        # mean/std of the 5-fold CV recall of the re-fitted configuration
        model_score = cross_val_score(model, self.x_train, self.y_train,
                                      cv=5,
                                      scoring='recall')
        time_retrain = time.time() - start_time_train
        print("The re-train time:", round(time_retrain, 5))
        print("The mean of CV score of the best training model:", round(model_score.mean(), 5))
        print("The std of CV score of the best training model:", round(model_score.std(), 5))
        return model_score.mean(), model_score.std(), model
# Hyperparameter grid for the logistic regression.
logistic_reg_params = {"penalty": ["l2"],  # regularization type
                       "C": [1, 2, 3],
                       # inverse of regularization strength; must be a positive float.
                       # Like in support vector machines, smaller values specify stronger regularization.
                       "tol": [1e-4, 1e-5],  # tolerance for stopping criteria
                       # only 'newton-cg', 'sag' and 'lbfgs' handle the multinomial loss, and they only support the L2 penalty
                       "solver": ["newton-cg", "lbfgs", "sag"],
                       # NOTE(review): the target here is binary (predict 0/1), so "multinomial"
                       # is unnecessary, and `multi_class` is deprecated in recent scikit-learn;
                       # confirm before removing
                       "multi_class": ["multinomial"],
                       "random_state": [0]}
# tune, fit, persist and cross-validate the logistic regression
logistic_reg = ML_models(X_train,
                         y_train,
                         X_test,
                         y_test,
                         "LogisticRegression",
                         "logistic_reg",
                         model_param=logistic_reg_params)
logistic_reg_train = logistic_reg.train()
from sklearn.ensemble import RandomForestClassifier
# Hyperparameter grid for the random forest.
# FIX: dropped 'auto' from max_features -- for classifiers it was an alias of
# 'sqrt' (so it only duplicated grid points) and it was removed in
# scikit-learn 1.3, where it raises an error.
random_forest_params = {'bootstrap': [True, False],
                        'max_depth': [10, 40, 50, 90, None],
                        'max_features': ['sqrt'],
                        'min_samples_leaf': [1, 2, 4],
                        'min_samples_split': [2, 5, 8, 10],
                        'n_estimators': [50, 100, 200, 300, 400, 500]}
random_forest = ML_models(X_train,
                          y_train,
                          X_test,
                          y_test,
                          "RandomForestClassifier",
                          "random_forest",
                          model_param=random_forest_params)
# FIX: removed a duplicate `from sklearn.ensemble import RandomForestClassifier`
random_forest_train = random_forest.train()
# reload the persisted best models for evaluation
reg_model = load('models/logistic_reg.joblib')
tree_model = load('models/random_forest.joblib')
# held-out accuracy of the logistic regression
np.mean(reg_model.predict(X_test) == y_test)
import scikitplot as skplt
skplt.metrics.plot_confusion_matrix(y_test, reg_model.predict(X_test))
# FIX: y_pred was used one line BEFORE it was defined (NameError);
# define it first, then plot. The 0.2 threshold trades precision for recall.
y_pred = tree_model.predict_proba(X_test)[:, 1] > 0.2
skplt.metrics.plot_confusion_matrix(y_test, y_pred)
# FIX: the original sliced the importances of the FIRST 20 features, not the
# 20 most important ones; rank all features, then keep the top 20.
feature_importance = np.abs(tree_model.feature_importances_)
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)[-20:]  # top 20, ascending
pos = np.arange(sorted_idx.shape[0]) + .5
featfig = plt.figure(figsize=(12, 10))
featax = featfig.add_subplot(1, 1, 1)
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
featax.set_yticklabels(np.array(X_train.columns)[sorted_idx], fontsize=8)
featax.set_xlabel('Relative Feature Importance')
plt.tight_layout()
plt.show()